library(tidyverse)
library(lmtest); library(sandwich)
library(foreach); library(doParallel)
select = dplyr::select
filter = dplyr::filter


#-------------------------------------------------------------------------#
# ols ---------------------------------------------------------------------
#-------------------------------------------------------------------------#

#--------------------------#
# ols fit -----------------#
#--------------------------#

fn_fit_ols = function(df, xvar, yvar) {
  # this function simply returns a ols fit object
  
  dt = df %>% 
    select(all_of(c(xvar, yvar))) %>%
    filter(across(
      all_of(c(yvar, xvar)), 
      list(~ !is.infinite(.), ~ !is.nan(.), ~ !is.na(.))
    )) %>% 
    # mutiple y by 100
    mutate_at(all_of(yvar), ~ .*100) %>%
    # standardize x
    mutate_at(all_of(xvar), ~ (. - mean(.))/sd(.) )
  
  fn = paste(yvar, '~ .')
  fit = lm(fn, dt)
  
  return(fit)
}



#--------------------------#
# os prediction -----------#
#--------------------------#

fn_os_ols = function(df, xvar, yvar, ini_win, fixed_win, ncore = 5) {
  
  # This function runs OOS forecasts using OLS
  
  # INPUT:
  #   + <df>        = data frame [date | <yvar> target | <xvar> predictor]
  #   + <ini_win>   = initial training window
  #   + <fixed_win> = TRUE <rolling window> or FALSE <expanding window>
  #   + <ncore>     = number of cores to use in parallezation
  
  # OUTPUT: a list containing
  #   + <r2>        = OOS R2 computed using the whole evaluation sample
  #   + <y>         = data frame [date | y_test <actual yvar> | 
  #                    y_hat <yvar forecast with xvar> | 
  #                    y_bar <yvar foreast with historical mean>]
  
  # NOTE: <yvar> MUST be named as "--h" with "h" indicating 
  #       the number of overlapping periods; the code ajusts the training window to   
  #       avoid look-ahead bias
  
  dt = df %>%
    filter(across(
        all_of(c(yvar, xvar)), 
        list(~ !is.infinite(.), ~ !is.nan(.), ~ !is.na(.))
    ))

  h  = str_extract_all(yvar, '\\d+$') %>% as.integer()
  n  = nrow(dt)
  
  
  
  library(foreach); library(doParallel)
  registerDoParallel(ncore) # set the number of cores to use

  out = foreach(

    i         = (ini_win+h):n,
    .packages = c('tidyverse', 'zoo')

  ) %dopar%

  {
    if (fixed_win == TRUE) {
      # use rolling window
      dt_train = dt[(i-ini_win-h+1):(i-h),]
    } else {
      # use expanding window
      dt_train = dt[1:(i-h),] # leave out h periods between train and test
    }
    
    x_train = dt_train[xvar] %>% as.matrix()
    y_train = dt_train[[yvar]]
    
    
    dt_test = dt[i,]
    x_test = dt_test[xvar] %>% unlist()
    y_test = dt_test[[yvar]]
    
    
    # predict y using historical mean
    y_bar  = mean(y_train)
    
    
    # predict y using ols
    coefs               = lm(y_train ~ x_train)$coefficients
    # because of linearity, some of coefs are NA -> set to 0
    coefs[is.na(coefs)] = 0 
    y_hat               = coefs %*% c(1, x_test)
    
    
    return(c(y_test, y_hat, y_bar))
  }
  
  stopImplicitCluster()
  
  
  y = do.call(rbind, out) %>% 
    as.data.frame() %>% 
    set_names(c('y_test','y_hat','y_bar')) %>% 
    as_tibble()
  
  y = cbind(dt[(ini_win+h):n, 1], y) # get date
  
  os_r2 = 1 - sum((y$y_test - y$y_hat)^2) / sum((y$y_test - y$y_bar)^2)
  
  
  out = list(r2 = os_r2, y = y)
  
  return(out)
}



#-------------------------------------------------------------------------#
# pls ---------------------------------------------------------------------
#-------------------------------------------------------------------------#

#---------------------------------------#
# construct pls: full sample -----------#
#---------------------------------------#

fn_construct_pls_full = function(df, xvar, yvar, ma_win = 1, scale = F) {
  
  dt = df %>% 
    filter(across(
      all_of(c(yvar, xvar)), 
      list(~ !is.infinite(.), ~ !is.nan(.), ~ !is.na(.))
    ))
  
  x  = dt[xvar] %>% as.matrix()
  # standardize x variables
  if (scale == TRUE) {
    x = apply(x, 2, function(v) (v - mean(v)) / sd(v))
  }
  
  y  = dt[[yvar]]
  
  # first step: regress each x on y to get the coefficients
  pi = sapply(1:ncol(x), function(i) coef(lm(x[,i] ~ y))[2])%>% unname()
  
  # second step: at each time t, regress x on pi to obtain x_pls
  x_pls = sapply(1:nrow(x), function(i) coef(lm(x[i,] ~ pi))[2]) %>% unname()
  
  x_pls = x_pls %>% rollmean(k = ma_win, fill = NA, align = 'right')
  
  
  out = list(
    w = pi %>% setNames(xvar),
    x_pls = data.frame(date = dt[,1], x_pls = x_pls) %>% as_tibble()
  )
  
  return(out)
}



#---------------------------------------#
# construct pls: recusive sample -------#
#---------------------------------------#

fn_construct_pls_recursive = function(df, xvar, yvar, ini_win, fixed_win) {
  # contructs x_pls in (t+1) recursively using info up to only (t)
  
  # for each yvar, have to leave h periods between train and test sample to avoid 
  # look-ahead bias
  h = str_extract_all(yvar, '\\d+') %>% as.integer()
  
  dt = df %>%
    filter(across(
      all_of(c(yvar, xvar)), 
      list(~ !is.infinite(.), ~ !is.nan(.), ~ !is.na(.))
    ))
  
  n  = nrow(dt)
  
  
  x_pls = sapply((ini_win+h):n, function(i){
    
    if (fixed_win == TRUE) {
      # use rolling window
      dt_train = dt[(i-ini_win-h+1):(i-h),]
    } else {
      # use expanding window
      dt_train = dt[1:(i-h),] # leave out h periods between train and test
    }
    
    x_train = dt_train[xvar] %>% as.matrix()
    y_train = dt_train[[yvar]]
    
    x_test  = dt[i, xvar] %>% unlist()
    
    
    # first step: regress each x on y to get the coefficients
    pi = sapply(1:ncol(x_train), function(i)coef(lm(x_train[,i] ~ y_train))[2])
    
    # second step: regress x_test on pi 
    x_pls = coef( lm(x_test ~ pi) )[2]
    
    return(x_pls)
  })
  
  out = list(
    x_pls = data.frame(
      date  = dt[(ini_win+h):n, 1], 
      x_pls = x_pls
    ) %>% as_tibble()
  )
  
  return(out)
}



#---------------------------------------#
# os prediction ------------------------#
#---------------------------------------#

fn_os_pls = function(df, xvar, yvar, ini_win, fixed_win, 
                     ma_win = 1, scale = F, ncore = 24) {

  # This function runs OOS forecasts using PLS (partial least square)
  
  # INPUT:
  #   + <df>        = data frame [date | <yvar> target | <xvar> predictor]
  #   + <ini_win>   = initial training window
  #   + <fixed_win> = TRUE <rolling window> or FALSE <expanding window>
  #   + <ncore>     = number of cores to use in parallezation
  
  # OUTPUT: a list containing
  #   + <r2>        = OOS R2 computed using the whole evaluation sample
  #   + <y>         = data frame [date | y_test <actual yvar> | 
  #                    y_hat <yvar forecast with xvar> | 
  #                    y_bar <yvar foreast with historical mean>]
  
  # NOTE: <yvar> MUST be named as "--h" with "h" indicating 
  #       the number of overlapping periods; the code ajusts the training window to   
  #       avoid look-ahead bias
  
  h  = str_extract_all(yvar, '\\d+$') %>% as.integer()
  dt = df %>% 
    filter(across(
      all_of(c(yvar, xvar)), 
      list(~ !is.infinite(.), ~ !is.nan(.), ~ !is.na(.))
    ))
  
  n  = nrow(dt)

  
  library(foreach); library(doParallel)
  registerDoParallel(ncore) # set the number of cores to use
  
  out = foreach(
    
    i         = (ini_win+h):n,
    .packages = c('tidyverse', 'zoo')
    
    ) %dopar% 
    
    {
      if (fixed_win == TRUE) {
        # use rolling window
        dt_train = dt[(i-ini_win-h+1):(i-h),]
      } else {
        # use expanding window
        dt_train = dt[1:(i-h),] # leave out h periods between train and test
      }
      
      x_train = dt_train[xvar] %>% as.matrix()
      y_train = dt_train[[yvar]]
      
      
      dt_test = dt[i,]
      x_test  = dt_test[xvar] %>% unlist()
      y_test  = dt_test[[yvar]]
      
      x_use = rbind(x_train, x_test)
      
      if (scale == TRUE) {
        x_use = scale(x_use)
      }
      
      x_train = x_use %>% head(-1)
      
      
      # predict y using historical mean
      y_bar  = mean(y_train)
      
      
      ##########################-
      ### predict y using pls ###
      ##########################-
      
      # first step: regress each x_train on y_train to get the coefficients
      pi = sapply(1:ncol(x_train),
                  function(i) coef( lm(x_train[,i] ~ y_train) )[2])
      
      
      # second step: at each time t, regress x on pi to obtain x_train_pls
      x_pls = sapply(1:nrow(x_use), 
                     function(i) coef(lm(x_use[i, ] ~ pi))[2])
      
      # moving averages
      x_pls = x_pls %>% 
        rollmean(k = ma_win, fill = NA, align = 'right')
      
      
      # third step: predict y using pls
      fit    = lm(y_train ~ x_pls %>% head(-1))
      y_hat  = fit$coefficients %*% c(1, x_pls %>% tail(1))
      
      
      # store both the pls weights and y values
      out = list(y = c(y_test, y_hat, y_bar), w = pi)
      return(out)
    }
  
  stopImplicitCluster()
  
  
  # extract pls weights
  w = out %>% lapply(`[[`, 2) %>% 
    do.call(what = rbind) %>%  
    as.data.frame() %>% 
    set_names(xvar)
  w = cbind(dt[(ini_win+h):n, 1], w) # get date
  
  # extract list of predictions
  y = out %>% lapply(`[[`, 1) %>%  
    do.call(rbind,.) %>% 
    data.frame() %>% 
    set_names(c('y_test', 'y_hat', 'y_bar'))

  y =  cbind(dt[(ini_win+h):n, 1], y) 
  
  # compute r2
  os_r2 = 1 - sum( (y$y_test - y$y_hat)^2 ) / sum( (y$y_test - y$y_bar)^2 )
  

  # final results to return
  out  = list(r2 = os_r2, y = y, w = w)
  return(out)
}



#-----------------------------------------------------------------------#
# FUNCTIONS TO DEAL WITH OS-R2 OUTPUT FROM PREVIOUS FUNCTIONS -----------
#-----------------------------------------------------------------------#

#---------------------------------------------------------#
# get os r2 over a given period --------------------------#
#---------------------------------------------------------#

fn_get_os_r2 = function(df, yr_range) {
  
  # This function computes the OOS R2 from return forecasts
  
  # INPUT:  
  #   + <df>       = data frame [year | y_test <actual yvar> | 
  #                             y_hat <yvar forecast with xvar> | 
  #                             y_bar <yvar foreast with historical mean>]
  #   + <yr_range> = range of years to compute OOS R2
  
  # OUTPUT: OO2 R2 * 100
  
  dt = df %>% filter(year %in% yr_range)
  
  os_r2 = 1 - sum((dt$y_test - dt$y_hat)^2) / sum((dt$y_test - dt$y_bar)^2)
  
  return( os_r2 * 100 )
}



#---------------------------------------------------------#
# Clark and West (2007) test -----------------------------#
#---------------------------------------------------------#

fn_cw_os_r2 = function(df, h, yr_range) {
  
  # This function computes the p-value of Clark and West (2007) test: os_r2 <= 0
  
  # INPUT:  
  #   + <df>       = data frame [year | y_test <actual yvar> | 
  #                             y_hat <yvar forecast with xvar> | 
  #                             y_bar <yvar foreast with historical mean>]
  #   + <h>        = number of overlapping periods in return forecasts
  #   + <yr_range> = range of years to compute OOS R2
  
  # OUTPUT: p-value
  
  library(lmtest)
  
  dt = df %>% filter(year %in% yr_range)
  
  f  = (dt$y_test - dt$y_bar)^2 - 
    ( (dt$y_test - dt$y_hat)^2 - (dt$y_bar - dt$y_hat)^2 )
  
  fit = lm(f ~ 1)
  fit_robust = coeftest(fit, vcov. = NeweyWest(fit, lag = h, prewhite = FALSE))
  
  t   = fit_robust[3] #summary(fit)$coefficients[3]
  p   = pnorm(t, lower.tail = FALSE)
  
  return(p)
}



#---------------------------------------------------------#
# add stars to the os r2 test ----------------------------#
#---------------------------------------------------------#

fn_star_os_r2 = function(df, h, yr_range) {
  
  # This function computes and adds stars to the OOS R2
  
  # NEED: <fn_get_os_r2> and <fn_cw_os_r2> and 
  
  # This function computes the p-value of Clark and West (2007) test: os_r2 <= 0
  
  # INPUT:  
  #   + <df>       = data frame [year | y_test <actual yvar> | 
  #                             y_hat <yvar forecast with xvar> | 
  #                             y_bar <yvar foreast with historical mean>]
  #   + <h>        = number of overlapping periods in return forecasts
  #   + <yr_range> = range of years to compute OOS R2
  
  # OUTPUT: p-value
  
  # compute os_r2
  os_r2 = fn_get_os_r2(df = df, yr_range = yr_range)
  
  # compute os test
  p = fn_cw_os_r2(df = df, h = h, yr_range = yr_range)
  
  if (p <= 0.01) {
    out = paste0(os_r2, " ***")
  } else if (p <= 0.05) {
    out = paste0(os_r2, ' **')
  } else if (p <= 0.1) {
    out = paste0(os_r2, ' *')
  } else {
    out = os_r2
  }
  
  return(out)
}



#---------------------------------------------------------#
# difference in cumulative sum of square errors -----------#
#---------------------------------------------------------#

fn_cum_os_r2 = function(df, year) {
  # compute the difference in cumulative sse of historical y and predicted y
  # a value > 0 indicates that the predictive y outperforms upto that point
  
  dt = df %>% filter(year >= year)
  
  cum_r2 = (cumsum((dt$y_test - dt$y_bar)^2) - cumsum((dt$y_test - dt$y_hat)^2)) /
    sum((dt$y_test - dt$y_bar)^2)
  dt$cum_r2 = cum_r2
  
  return(dt)
}